Word embeddings

Imports

Import dependencies


In [1]:
%%bash
ls | grep .csv


emails.csv
emails.csv.zip

In [2]:
# %%bash
# pip3 install bokeh

In [3]:
# built-in libs
import email

# processing libs
import pandas as pd

# display libs
from tqdm import tqdm_notebook

Import data


In [4]:
emails_full_df = pd.read_csv('emails.csv', chunksize=10000)
emails_df = next(emails_full_df)

In [5]:
print(emails_df.shape)
emails_df.head()


(10000, 2)
Out[5]:
file message
0 allen-p/_sent_mail/1. Message-ID: <18782981.1075855378110.JavaMail.e...
1 allen-p/_sent_mail/10. Message-ID: <15464986.1075855378456.JavaMail.e...
2 allen-p/_sent_mail/100. Message-ID: <24216240.1075855687451.JavaMail.e...
3 allen-p/_sent_mail/1000. Message-ID: <13505866.1075863688222.JavaMail.e...
4 allen-p/_sent_mail/1001. Message-ID: <30922949.1075863688243.JavaMail.e...

In [6]:
emails_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
file       10000 non-null object
message    10000 non-null object
dtypes: object(2)
memory usage: 156.3+ KB

In [7]:
%time
messages_obj_lst = []
messages_str_lst = []

message_metadata = {}

for i in tqdm_notebook(range(emails_df.shape[0])):
    msg = email.message_from_string(emails_df.message[i])
    
    for msg_property in msg:
        if msg_property in message_metadata:
            message_metadata[msg_property][i] = msg[msg_property]
        else:
            message_metadata[msg_property] = ['N/A'] * emails_df.shape[0]
    
    payload = msg.get_payload() # decode=True
    
    messages_obj_lst.append(msg)
    messages_str_lst.append(payload) #.encode('utf-8').decode('unicode_escape')
    #except KeyboardInterrupt:
    #    break

print('messages_obj_lst size: %i' % len(messages_obj_lst))


CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 4.53 µs
messages_obj_lst size: 10000

In [8]:
# update dataframe object
# emails_df.rename(columns = {'message':'message_obj'}, inplace = True)
emails_df = emails_df.assign(message_obj = pd.Series(messages_obj_lst).values)
emails_df = emails_df.assign(payload     = pd.Series(messages_str_lst).values)

# print(emails_df.payload.str.contains(r'\\'))
emails_df['payload'] = emails_df.payload.str.replace(r'\n', '')

In [9]:
emails_df.head()


Out[9]:
file message message_obj payload
0 allen-p/_sent_mail/1. Message-ID: <18782981.1075855378110.JavaMail.e... [Message-ID, Date, From, To, Subject, Mime-Ver... Here is our forecast
1 allen-p/_sent_mail/10. Message-ID: <15464986.1075855378456.JavaMail.e... [Message-ID, Date, From, To, Subject, Mime-Ver... Traveling to have a business meeting takes the...
2 allen-p/_sent_mail/100. Message-ID: <24216240.1075855687451.JavaMail.e... [Message-ID, Date, From, To, Subject, Mime-Ver... test successful. way to go!!!
3 allen-p/_sent_mail/1000. Message-ID: <13505866.1075863688222.JavaMail.e... [Message-ID, Date, From, To, Subject, Mime-Ver... Randy, Can you send me a schedule of the salar...
4 allen-p/_sent_mail/1001. Message-ID: <30922949.1075863688243.JavaMail.e... [Message-ID, Date, From, To, Subject, Mime-Ver... Let's shoot for Tuesday at 11:45.

In [10]:
# del messages_obj_lst
# del messages_str_lst

emails_df.drop('message', axis=1, inplace=True)

In [ ]:


In [11]:
corpus_text = '\n'.join(emails_df[:50000]['payload'])
sentences = corpus_text.split('\n')
sentences = [line.lower().split(' ') for line in sentences]

In [12]:
def clean(s):
    return [w.strip(',."!?:;()\'') for w in s]
sentences = [clean(s) for s in sentences if len(s) > 0]

In [ ]:

Modelling


In [13]:
from gensim.models import Word2Vec

model = Word2Vec(sentences, size=100, window=5, min_count=3, workers=4)

In [14]:
vectors = model.wv
# del model

In [15]:
vectors['good']


Out[15]:
array([-0.270508  , -0.17306764,  1.6283128 ,  0.0789329 ,  0.31106964,
        0.769532  ,  1.2730443 , -0.8092405 ,  0.7060038 ,  0.86828023,
       -2.6277056 , -1.3929644 ,  0.6448156 , -0.7771182 , -1.6537852 ,
       -0.4743401 , -1.1166382 ,  1.1569368 , -1.1398625 ,  0.80520093,
       -1.7167239 , -1.5579057 ,  0.10402635,  3.0918787 , -0.0558991 ,
       -0.43233722, -2.051206  , -0.66570055,  1.5504636 , -0.2648149 ,
        0.11560618, -0.32946193, -0.372461  , -0.781641  ,  1.0626622 ,
       -0.5553393 ,  0.5192849 ,  2.4005246 ,  0.05692073, -2.3076432 ,
       -1.5484774 , -0.67129016,  1.7084714 ,  0.68807465, -0.2931756 ,
        0.6166011 ,  1.0874461 , -0.32894936, -2.7945118 ,  0.0930008 ,
       -0.46457678,  1.2848035 , -1.4603778 ,  0.22172059, -0.99450624,
       -1.0969896 , -2.3467455 ,  0.4534696 ,  0.4488058 , -0.7499471 ,
        1.3231988 ,  1.7316022 ,  0.3932503 ,  0.06664114,  0.47186232,
        2.9767272 , -0.49195403,  2.0907822 ,  1.1899747 ,  1.2920406 ,
        0.4943122 , -1.3712525 ,  0.35063776, -1.9195726 ,  1.0069174 ,
       -2.0902    , -0.33823916, -1.9204639 , -0.7886482 ,  2.2157645 ,
       -0.8052555 ,  0.6665139 , -1.1551962 , -0.8287558 ,  0.05057469,
        2.554974  , -0.62844616,  1.7241517 , -0.09033989,  0.4090363 ,
        0.8304872 , -0.9974313 ,  0.94597244, -0.65530026,  0.4551282 ,
        0.7032987 , -0.13521333, -1.385317  , -0.15468638,  0.7746631 ],
      dtype=float32)

In [16]:
print(vectors.similarity('you', 'your'))
print(vectors.similarity('you', 'internet'))


0.4465897
0.33204436
/usr/local/lib/python3.5/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):

In [17]:
vectors.most_similar('kill')


/usr/local/lib/python3.5/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):
Out[17]:
[('correspondence', 0.7597194910049438),
 ('havestructured', 0.7563828825950623),
 ('throw', 0.7550570964813232),
 ('=with', 0.7486575841903687),
 ('refight', 0.7483857870101929),
 ('thinkof', 0.7468332648277283),
 ('do.>', 0.7420369386672974),
 ('visitors', 0.741276204586029),
 ('nik', 0.7395448684692383),
 ('pleaded', 0.7374611496925354)]

In [18]:
len(model.wv.vocab)


Out[18]:
52543

In [19]:
# build a list of the terms, integer indices,
# and term counts from the food2vec model vocabulary
ordered_vocab = [(term, voc.index, voc.count) for term, voc in model.wv.vocab.items()]

# sort by the term counts, so the most common terms appear first
ordered_vocab = sorted(ordered_vocab, key=lambda k: -k[2])

# unzip the terms, integer indices, and counts into separate lists
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)
# print(ordered_terms)
# create a DataFrame with the food2vec vectors as data,
# and the terms as row labels
word_vectors = pd.DataFrame(model.wv.syn0norm[term_indices, :], index=ordered_terms)

word_vectors.head(3)


/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:13: DeprecationWarning: Call to deprecated `syn0norm` (Attribute will be removed in 4.0.0, use self.wv.vectors_norm instead).
  del sys.path[0]
Out[19]:
0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
0.103211 -0.001504 0.051223 0.082046 -0.015313 -0.031139 0.239088 -0.023072 0.260240 0.007476 ... -0.152058 -0.074253 -0.112685 -0.128899 -0.013398 -0.003880 0.017925 -0.033080 0.026232 0.085833
the 0.113068 0.025233 0.103621 0.153182 0.158703 -0.112839 0.170684 -0.030864 -0.080816 0.092015 ... 0.139859 -0.013526 -0.067141 -0.132761 0.028204 0.078747 0.030681 -0.142466 0.065911 -0.094258
to -0.170276 0.023519 -0.007925 0.020925 0.152509 -0.023285 0.163287 -0.003122 -0.051574 -0.069945 ... 0.040197 0.071331 0.091307 -0.199199 -0.065438 0.018316 -0.014435 -0.107577 0.190768 -0.028655

3 rows × 100 columns


In [20]:
def get_related_terms(token, topn=10):
    """
    look up the topn most similar terms to token
    and print them as a formatted list
    """

    for word, similarity in model.most_similar(positive=[token], topn=topn):
        print (word, round(similarity, 3))

In [21]:
get_related_terms(u'illegal')


societe 0.8
criminal 0.795
prohibited.neither 0.785
disclosureby> 0.784
representations 0.779
intensify 0.771
>others 0.769
asinformation 0.767
unauthorized 0.767
byothers 0.764
/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:7: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
  import sys
/usr/local/lib/python3.5/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):

In [22]:
get_related_terms(u'killed')


nero 0.861
howthis 0.824
al,i 0.824
beleivehowthis 0.815
rivera 0.813
after-what 0.807
disasterthanksgiving 0.797
mullick 0.795
asti 0.793
$175.00 0.792
/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:7: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
  import sys
/usr/local/lib/python3.5/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):

In [23]:
get_related_terms(u'contract')


/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:7: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
  import sys
/usr/local/lib/python3.5/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):
bridge 0.751
agreement 0.743
transaction 0.737
partnerwill 0.725
unit 0.717
dead 0.712
maximum 0.709
package 0.707
fee 0.705
bond 0.701

In [24]:
get_related_terms(u'fired')


nat 0.731
natural 0.697
swap 0.692
disaster 0.669
strip 0.661
vpenanat 0.654
curve.3 0.647
emit 0.643
year> 0.639
projs 0.636
/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:7: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
  import sys
/usr/local/lib/python3.5/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):

In [25]:
def word_algebra(add=[], subtract=[], topn=1):
    """
    combine the vectors associated with the words provided
    in add= and subtract=, look up the topn most similar
    terms to the combined vector, and print the result(s)
    """
    answers = model.most_similar(positive=add, negative=subtract, topn=topn)
    
    for term, similarity in answers:
        print(term)

In [26]:
word_algebra(add=[u'i', u'will'])


plans
/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:7: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
  import sys
/usr/local/lib/python3.5/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):

In [27]:
word_algebra(add=[u'you', u'will'])


them
/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:7: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
  import sys
/usr/local/lib/python3.5/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):

In [28]:
word_algebra(add=[u'i', u'am'])


i'm
/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:7: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
  import sys
/usr/local/lib/python3.5/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):

In [29]:
word_algebra(add=[u'mother', u'fuck'])


<jmcvey@exhibitworks.com>
/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:7: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
  import sys
/usr/local/lib/python3.5/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):

In [ ]:


In [30]:
from sklearn.manifold import TSNE

In [31]:
tsne_input = word_vectors
tsne_input = tsne_input.head(5000)

In [32]:
tsne_input[:2]


Out[32]:
0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
0.103211 -0.001504 0.051223 0.082046 -0.015313 -0.031139 0.239088 -0.023072 0.260240 0.007476 ... -0.152058 -0.074253 -0.112685 -0.128899 -0.013398 -0.003880 0.017925 -0.033080 0.026232 0.085833
the 0.113068 0.025233 0.103621 0.153182 0.158703 -0.112839 0.170684 -0.030864 -0.080816 0.092015 ... 0.139859 -0.013526 -0.067141 -0.132761 0.028204 0.078747 0.030681 -0.142466 0.065911 -0.094258

2 rows × 100 columns


In [33]:
%%time
tsne = TSNE()
tsne_vectors = tsne.fit_transform(tsne_input.values)


CPU times: user 1min 27s, sys: 6.4 s, total: 1min 33s
Wall time: 1min 33s

In [34]:
tsne_vectors = pd.DataFrame(tsne_vectors,
                            index=pd.Index(tsne_input.index),
                            columns=[u'x_coord', u'y_coord'])

tsne_vectors.head()


Out[34]:
x_coord y_coord
-43.682186 33.196354
the 60.936298 -31.871368
to 55.425861 -30.385191
and 33.788177 10.613073
of 35.482513 7.884809

In [35]:
tsne_vectors[u'word'] = tsne_vectors.index

In [36]:
tsne_vectors.head()


Out[36]:
x_coord y_coord word
-43.682186 33.196354
the 60.936298 -31.871368 the
to 55.425861 -30.385191 to
and 33.788177 10.613073 and
of 35.482513 7.884809 of

In [37]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value

output_notebook()


Loading BokehJS ...

In [38]:
# add our DataFrame as a ColumnDataSource for Bokeh
plot_data = ColumnDataSource(tsne_vectors)

# create the plot and configure the
# title, dimensions, and tools
tsne_plot = figure(title=u't-SNE Word Embeddings',
                   plot_width = 800,
                   plot_height = 800,
                   tools= (u'pan, wheel_zoom, box_zoom,'
                           u'box_select, reset'),
                   active_scroll=u'wheel_zoom')

# add a hover tool to display words on roll-over
tsne_plot.add_tools( HoverTool(tooltips = u'@word') )

# draw the words as circles on the plot
tsne_plot.circle(u'x_coord', u'y_coord', source=plot_data,
                 color=u'blue', line_alpha=0.2, fill_alpha=0.1,
                 size=10, hover_line_color=u'black')

# configure visual elements of the plot
tsne_plot.title.text_font_size = value(u'16pt')
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# engage!
show(tsne_plot);



In [ ]:


In [ ]:


Bibliography


In [ ]: